add files to compute basic stats on pseudo crawl dataset #382
SaulLu wants to merge 8 commits into bigscience-workshop:master
Conversation
Force-pushed from 96fcc41 to 19c004e
thomasw21 left a comment
LGTM! A few comments but should be good.
    parser.add_argument(
        "--save-batch-size", type=int, required=True, help="Batch size when writing."
    )
    parser.add_argument("--use-datasets-caching", action="store_true")
We never use it. Let's remove it.
In fact, it seems to me that I used use_datasets_caching
I'll remove save-batch-size
| "--save-path-stats-json", type=str, help="Where to save the stats json." | ||
| ) | ||
| parser.add_argument( | ||
| "--save-path-stats-full-json", type=str, help="Where to save the stats json." |
| "--save-path-stats-full-json", type=str, help="Where to save the stats json." | |
| "--save-path-stats-full-json", type=str, required=True, help="Where to save the stats json." |
| logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") | ||
| if not args.use_datasets_caching: | ||
| datasets.set_caching_enabled(False) | ||
| else: | ||
| logger.info( | ||
| f"the datasets results will be cached at {config.HF_DATASETS_CACHE}." | ||
| ) |
| logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") | |
| if not args.use_datasets_caching: | |
| datasets.set_caching_enabled(False) | |
| else: | |
| logger.info( | |
| f"the datasets results will be cached at {config.HF_DATASETS_CACHE}." | |
| ) | |
| logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ") |
In fact, it seems to me that I used use_datasets_caching
    ]
    ds_html = ds_html.map(
        get_length_text,
        batched=False,
Suggested change:
    -        batched=False,
    +        batched=True,
You just have to code a batched version of get_length_text
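For illustration, a batched variant might look like the sketch below. The actual get_length_text is not shown in this thread, so the column names ("text", "length_text") and the num_proc usage here are assumptions, not the PR's code.

    # Hypothetical sketch of a batched get_length_text; the column names
    # "text" and "length_text" are assumed, the PR's function may differ.
    def get_length_text(batch):
        batch["length_text"] = [
            len(text) if text is not None else 0 for text in batch["text"]
        ]
        return batch

    # With batched=True, datasets.Dataset.map passes a dict mapping column
    # names to lists of values, so one call processes a whole batch of rows.
    ds_html = ds_html.map(get_length_text, batched=True, num_proc=args.num_proc)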
    with open(save_path_tmp, "w", encoding="utf-8") as f:
        json.dump(data_stats, f, ensure_ascii=False, indent=4)
    logger.info(f"Moving the saved dataset to {str(save_path.absolute())}")
    subprocess.run(["mv", save_path_tmp, str(save_path.absolute())])
Yeah, I'd recommend os.rename. subprocess doesn't tell you if it fails.
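As a minimal sketch of that swap, assuming the same save_path / save_path_tmp variables as in the diff above (os.replace is used here as the overwrite-allowing variant of os.rename; the exact call is up to the author):

    import os

    # Unlike subprocess.run(["mv", ...]), which only signals failure through
    # its returncode unless check=True is passed, os.replace raises OSError
    # when the move cannot be performed.
    os.replace(save_path_tmp, str(save_path.absolute()))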
    save_path = Path(args.save_path_stats_full_json)
    tmp_file_name = f"tmp-{str(save_path.name)}"
    save_path_tmp = os.path.join(save_path.parent, tmp_file_name)
    logger.info(f"Saving the dataset at {save_path_tmp}")
    ds_html.to_json(
        save_path_tmp,
        batch_size=args.save_batch_size,
        num_proc=args.num_proc,
        compression="gzip",
    )
    logger.info(f"Moving the saved dataset to {str(save_path.absolute())}")
Suggested change:
    -    save_path = Path(args.save_path_stats_full_json)
    -    tmp_file_name = f"tmp-{str(save_path.name)}"
    -    save_path_tmp = os.path.join(save_path.parent, tmp_file_name)
    -    logger.info(f"Saving the dataset at {save_path_tmp}")
    -    ds_html.to_json(
    -        save_path_tmp,
    -        batch_size=args.save_batch_size,
    -        num_proc=args.num_proc,
    -        compression="gzip",
    -    )
    -    logger.info(f"Moving the saved dataset to {str(save_path.absolute())}")
    +    save_path_full = Path(args.save_path_stats_full_json)
    +    save_path_full_tmp = save_path_full.rename(f"{save_path_full.name}.tmp")
    +    logger.info(f"Saving the dataset at {save_path_full_tmp}")
    +    ds_html.to_json(
    +        save_path_full_tmp,
    +        batch_size=args.save_batch_size,
    +        num_proc=args.num_proc,
    +        compression="gzip",
    +    )
    +    logger.info(f"Moving the saved dataset to {str(save_path_full.absolute())}")
Two things:
- move with os.rename or shutil.move
- You're not scared of overwriting the previous full_json? (see the sketch below)
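One possible guard against silently overwriting the previous full_json, sketched here with shutil.move and the save_path_full / save_path_full_tmp names from the suggestion above. This is illustrative only, not the PR's code:

    import shutil

    # Illustrative guard: fail loudly instead of clobbering an existing
    # stats file, then move the temporary file into place. shutil.move
    # raises an exception on failure, unlike an unchecked `mv` subprocess.
    if save_path_full.exists():
        raise FileExistsError(f"{save_path_full} already exists, refusing to overwrite")
    shutil.move(str(save_path_full_tmp), str(save_path_full))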
    #SBATCH --partition=cpu_p1
    #SBATCH --time 10:00:00    # maximum execution time (HH:MM:SS)
    #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats_v5/%x-%j.out    # output file name
    #SBATCH --array=1-604
You need to add the new seeds. And I need to update the csv with the new seeds ....
@SaulLu can you edit and merge? Thanks!
@HugoLaurencon, do you need it urgently?
@SaulLu No, no worries! It was just to tag you in case you didn't see the comments from Thomas.
As discussed with @thomasw21, this PR adds basic Slurm and Python scripts to compute an intermediary metadata dataset and some statistics for the Pseudo Crawl dataset.